This notebook explores the sales data of 45 Wal-Mart locations. Exploratory data analysis was performed at the store and department level, and the findings were used to forecast sales.
Functionality Setting
# Silence all warnings for cleaner notebook output (pandas/statsmodels/sklearn
# emit many deprecation and convergence warnings during this analysis).
import warnings
warnings.filterwarnings("ignore")
Dependencies
# Plotting stack: matplotlib/seaborn for static charts, plotly for interactive.
import matplotlib.pyplot as plt
#%matplotlib inline
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
# Render plotly figures inline in the notebook.
pio.renderers.default='notebook'
import seaborn as sns
import pandas as pd
import numpy as np
import datetime
The sales and store data was collected from Walmart Recruiting's Kaggle Competition. This dataset only includes the influence of 4 holidays. This project began with the assumption that consumer spending is heavily influenced by holidays, so a second source was used to collect all relevant consumer events and commonly observed U.S. holidays. The datasets were then consolidated in a separate script (due to its extended runtime) and saved in sales_updated.csv.
# Holiday/event calendar (the consolidated U.S. consumer-spending events).
# pd.read_csv already returns a DataFrame -- the pd.DataFrame() wrapper the
# original used was redundant.
holidays = pd.read_csv("data/holidays.csv")
holidays.head()
holidays.Date = pd.to_datetime(holidays.Date)
#walmart's
# Weekly sales per store/department from the Kaggle dataset (4 holidays only).
# Dropped the redundant pd.DataFrame() wrapper -- read_csv returns a DataFrame.
sales = pd.read_csv("data/sales.csv")
sales.tail()
sales.Date = pd.to_datetime(sales.Date)
#all
# Sales merged with the full holiday/event calendar (built in a separate script).
# Dropped the redundant pd.DataFrame() wrapper -- read_csv returns a DataFrame.
sales_all = pd.read_csv("data/sales_updated.csv")
sales_all.tail()
# BUG FIX: the original parsed pd.to_datetime(sales.Date) -- the OTHER frame's
# dates -- and assigned them onto sales_all. Parse sales_all's own Date column.
sales_all.Date = pd.to_datetime(sales_all.Date)
sales.info()
sales.describe()
# Store metadata: Type and Size (square footage) for each of the 45 stores.
# Dropped the redundant pd.DataFrame() wrapper -- read_csv returns a DataFrame.
stores = pd.read_csv("data/stores.csv")
stores.tail()
stores.info()
stores.describe()
#setting color palette for charts
pio.templates.default = "plotly_white"
# Dropped the redundant `flatui` alias -- the list was bound to two names but
# only `palette` is ever used.
palette = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
sns.set_palette(palette)
# NOTE(review): the 'seaborn-darkgrid' style name was removed in
# matplotlib >= 3.6 (renamed 'seaborn-v0_8-darkgrid') -- confirm the pinned
# matplotlib version before upgrading.
plt.style.use('seaborn-darkgrid')
Review missing values
# Unique values per column -- coverage sanity check.
sales.nunique()
# Would a complete panel (45 stores x 81 depts x 143 weeks) match the row count?
45*81*143 == len(sales)
#Departments with missing weekly sales
sales.groupby('Dept').nunique().query("Date<143")
#Stores with missing weekly sales
sales.groupby('Store').nunique().query("Date<143")
#Number of unique depts per store
sales.groupby('Store').nunique().sort_values(['Dept']).plot()
High-level analysis of sales and holiday's potential influence on seasonality
# Aggregate weekly sales across all stores and departments.
byDate = sales.groupby('Date', sort=False).sum()
# sum() turned the per-row holiday flags into counts; collapse back to 0/1.
byDate['IsHoliday'] = byDate['IsHoliday'].astype(bool).astype(int)
# Weekly data, Friday-anchored.
byDate.index.freq = 'W-FRI'
byDate.head()
# Subset of weeks flagged as Walmart holidays.
holidaysdf = byDate[byDate['IsHoliday'] == 1]
# Interactive line chart of total weekly sales with a range slider/selector.
line = go.Figure(
    go.Scatter(x=byDate.index, y=byDate.Weekly_Sales, name='Weekly_Sales')
)
line.update_layout(
    title=dict(text="Total Weekly Sales",
               y=0.9, x=0.5,
               xanchor='center', yanchor='top')
)
# Quick-zoom buttons: 1 month / 6 months / 1 year / full history.
selector_buttons = [
    dict(count=1, label="1m", step="month", stepmode="backward"),
    dict(count=6, label="6m", step="month", stepmode="backward"),
    dict(count=1, label="1y", step="year", stepmode="backward"),
    dict(step="all"),
]
line.update_xaxes(rangeslider_visible=True,
                  rangeselector=dict(buttons=selector_buttons))
line.show()
# Overlay a faint vertical marker on each of Walmart's designated holiday weeks.
ymin = byDate.Weekly_Sales.min()
ymax = byDate.Weekly_Sales.max()
lin = go.Figure(data=go.Scatter(x=byDate.index, y=byDate.Weekly_Sales, name='Weekly_Sales'))
# (removed an unused `final_wk = max(byDate.index)` assignment -- it is
# recomputed immediately before its first use in the next cell)
for date in byDate.query("IsHoliday==1").index:
    lin.add_shape(type="line",
                  x0=date,
                  y0=ymin,
                  x1=date,
                  y1=ymax,
                  opacity=0.3,
                  line=dict(color="RoyalBlue"))
lin.update_layout(title={
    'text': "Total Weekly Sales with Walmart's Holiday Markers",
    'y':0.9,
    'x':0.5,
    'xanchor': 'center',
    'yanchor': 'top'})
lin.show()
# Same chart, but marking every holiday/event from the full calendar.
lin = go.Figure(go.Scatter(x=byDate.index, y=byDate.Weekly_Sales, name='Weekly_Sales'))
final_wk = max(byDate.index)
# Only mark holidays that fall within the observed sales window.
observed_holidays = holidays[holidays.Date <= final_wk]
for _, row in observed_holidays.iterrows():
    marker_date = row['Date']
    lin.add_shape(type="line",
                  x0=marker_date, y0=ymin,
                  x1=marker_date, y1=ymax,
                  opacity=0.3,
                  line=dict(color="RoyalBlue"))
lin.update_layout(
    title=dict(text="Total Weekly Sales with All Holiday Markers",
               y=0.9, x=0.5, xanchor='center', yanchor='top'))
lin.show()
Some holidays show a day-of sales effect, while others show trailing/leading effects in the surrounding weeks.
How do the stores' size and type influence sales?
# Total sales per store, joined with store metadata (Type, Size).
byStore = sales.reset_index().groupby('Store', as_index=False).sum()
# NOTE(review): sum() also aggregates the reset index, Dept, and IsHoliday
# columns, which is not meaningful; only Weekly_Sales matters here and the
# extras are dropped a few lines below.
byStore = pd.merge(byStore, stores, on='Store', how='left')
byStore.head()
byStore.nunique()
#Formatting df for visuals
byStore = byStore[["Store", "Weekly_Sales", "Type", "Size"]]
byStore.head()
# Total sales per store as a bar chart.
byStore_Bar = go.Figure(data=go.Bar(x=byStore.Store,
                                    y=byStore.Weekly_Sales,
                                    name='Weekly_Sales')
                        )
# FIX: the title was copy-pasted from the holiday-marker chart
# ("Total Weekly Sales with All Holiday Markers"); this chart is per-store.
byStore_Bar.update_layout(title={
    'text': "Total Weekly Sales by Store",
    'y': 0.9,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top'})
byStore_Bar.show()
Some stores outperform others, potentially influenced by type, size, location, etc.
# Distribution of total sales and of size, broken out by store type.
for metric in ("Weekly_Sales", "Size"):
    fig = px.box(byStore, x="Type", y=metric, color="Type")
    fig.show()
# How store sizes are distributed within each type.
size_Dist = px.histogram(byStore, x="Size", color='Type')
size_Dist.show()
# Raw relationship between footprint and total sales.
fig = px.scatter(byStore, x="Size", y='Weekly_Sales', color='Type')
fig.show()
# Normalize sales by square footage to compare efficiency across sizes.
byStore['SalesPerSF'] = byStore.Weekly_Sales / byStore.Size
fig = px.scatter(byStore, x="Size", y='SalesPerSF', color='Type')
fig.show()
Types seem to be based solely on size: the smaller stores make the best use of their space, the larger stores appear average, and the medium stores have the lowest performance.
Because type is driven by size and does not provide the desired detail, we dig deeper into size itself: cluster the stores to further analyze sales behavior across store sizes.
Cluster stores in the hope of finding stronger relationships — use machine learning to cluster stores based on size.
# Cluster stores on per-square-foot efficiency, total sales, and size.
# FIX: take an explicit copy so assigning the Cluster column below writes to
# byStorex itself instead of a view of byStore (SettingWithCopyWarning).
byStorex = byStore[['SalesPerSF', 'Weekly_Sales', 'Size']].copy()
byStorex.head()
from sklearn.cluster import KMeans
# FIX: pin random_state so the cluster labels are reproducible run to run
# (previously labels could shuffle on every execution).
# NOTE(review): features are unscaled, so Weekly_Sales (largest magnitude)
# dominates the euclidean distance -- consider StandardScaler.
kmeans = KMeans(n_clusters=6, random_state=0)
kmeans.fit(byStorex)
byStorex['Cluster'] = kmeans.labels_
byStorex.head()
Now that the stores are clustered, more meaningful inferences can be drawn about size's influence on sales.
# Efficiency vs size per cluster; bubble area encodes total sales.
cluster_scatter = px.scatter(byStorex, x="Size", y='SalesPerSF',
                             color='Cluster', size='Weekly_Sales')
cluster_scatter.show()
# Efficiency spread within each cluster.
cluster_box = px.box(byStorex, x="Cluster", y="SalesPerSF", color="Cluster")
cluster_box.show()
Final thoughts on store-level sales.
Which departments are influenced by seasonality?
#number of unique departments
sales.Dept.nunique()
# Total sales per department across all stores and weeks.
byDept = sales.groupby('Dept').sum()
byDept.head()
byDept_Bar = px.bar(byDept, y='Weekly_Sales')
byDept_Bar.show()
# Departments whose aggregate sales are zero or negative.
byDept[byDept.Weekly_Sales <= 0]
Determine the stationarity of each department's series (via lags and the augmented Dickey–Fuller test).
from statsmodels.tsa.stattools import adfuller
# (removed commented-out scratch code that re-read data/sales.csv)
#median number of records for depts with NO missing week
sales.groupby('Dept').nunique().sort_values('Date').query('Date==143')['Weekly_Sales'].median()
sales.Date.nunique()
#Depts with missing weeks
dept_w_missing_wks = sales.groupby('Dept').nunique().sort_values('Date').query('Date<143')
missing_wks = dept_w_missing_wks.index
# Plot each incomplete department so the gaps can be inspected visually.
for dept in missing_wks:
    data = sales.query(f"Dept=={dept}").groupby('Date').sum()
    fig = px.line(data, y="Weekly_Sales")
    fig.show()
# Run an augmented Dickey-Fuller test on each complete department's weekly
# sales and record whether the series is stationary (p <= 0.05).
# Refactor: replaced three parallel accumulator lists with a single list of
# record dicts, and dropped the unused `lags` local.
records = []
for dept in sales.Dept.unique():
    if dept in missing_wks:
        # Departments with missing weeks were reviewed separately above.
        continue
    weekly = sales.query(f"Dept=={dept}").groupby('Date').sum()
    p_value = adfuller(weekly['Weekly_Sales'], maxlag=55)[1]
    is_stationary = p_value <= 0.05
    records.append({
        'Dept': dept,
        'Stationary': "Stationary" if is_stationary else "Non-Stationary",
        'Stationary_Bool': int(is_stationary),
    })
stationarities = pd.DataFrame(records)
stationarities.head()
Stationary vs. non-stationary: how many departments, what share of departments, and what share of sales depend on seasonality/trends?
# Join the stationarity labels back onto the raw sales rows.
byStat = pd.merge(sales, stationarities, on='Dept', how='left')
# Total weekly sales split by stationary vs non-stationary departments.
byStat = byStat.groupby(['Stationary','Date'], as_index=False).sum()
byStat.head()
byStat_line = px.line(byStat,x='Date', y='Weekly_Sales', color='Stationary')
byStat_line.show()
# Share of total sales from stationary vs non-stationary departments
# (px.pie sums Weekly_Sales within each Stationary label).
fig = px.pie(byStat, values='Weekly_Sales', names='Stationary')
fig.show()
Methodology: the stationarity / Dickey–Fuller test results determine the model type (ARIMA vs. SARIMAX). [TODO: the rest of this cell was placeholder filler text — replace with the full methodology description.]
from statsmodels.tsa.seasonal import seasonal_decompose
# Additive decomposition of total weekly sales into trend/seasonal/residual.
result = seasonal_decompose(byDate['Weekly_Sales'], model='add')
result.plot();
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf # for determining (p,q) orders
from pmdarima import auto_arima
# FIX: copy the two-column slice so the assignments below modify `models`
# itself rather than a view of byDate (avoids SettingWithCopyWarning).
models = byDate[['Weekly_Sales', 'IsHoliday']].copy()
# Idempotent here (IsHoliday was already collapsed to 0/1 earlier); kept as a guard.
models.IsHoliday = byDate.IsHoliday.astype('bool').astype('int')
models.head()
# Weekly data, Friday-anchored.
models.index.freq = 'W-FRI'
models.index
#confirm stationarity
adfuller(models['Weekly_Sales'], maxlag=55)
#Run pmdarima.auto_arima to obtain recommended orders
auto_arima(models['Weekly_Sales'],seasonal=True,m=52).summary()
# FIX: a notebook-export glitch fused this comment with the next statement
# ("...train/test setsdates = models.index"); re-split into comment + code.
#Split the data into train/test sets
dates = models.index
length = len(models)
# 70/30 chronological split -- no shuffling for time series.
train_len = int(length*0.70)
train = models.iloc[:train_len]
test = models.iloc[train_len:]
# Fit a SARIMA(2,0,2)(1,0,0,52) model -- baseline, no exogenous holiday
# regressor. (The original comment said SARIMA(1,0,0)(2,0,0,7), which did not
# match the orders actually passed below.)
model1 = SARIMAX(train['Weekly_Sales'],order=(2,0,2),seasonal_order=(1,0,0,52),enforce_invertibility=False)
results = model1.fit()
results.summary()
# Obtain predicted values over the test window (one-step-ahead, not dynamic).
start=len(train)
end=len(train)+len(test)-1
predictions1 = results.predict(start=start, end=end, dynamic=False)
models['NoHolidays'] = predictions1
# NOTE: dropna() discards every training-period row (NoHolidays is NaN there),
# so from here on `models` covers only the test window.
models.dropna(inplace=True)
models.head()
# Plot actual vs model-1 predicted sales over the test window.
models_lines = go.Figure()
actual_trace = go.Scatter(x=models.index,
                          y=models.Weekly_Sales,
                          name="Weekly Sales - Actual")
model1_trace = go.Scatter(x=models.index,
                          y=models.NoHolidays,
                          name="Model 1 - No Holidays")
models_lines.add_trace(actual_trace)
models_lines.add_trace(model1_trace)
models_lines.show()
from statsmodels.tools.eval_measures import mse,rmse
# Model 1 error on the held-out test window.
mse1 = mse(test['Weekly_Sales'], predictions1)
rmse1 = rmse(test['Weekly_Sales'], predictions1)
print(f"MSE: {mse1}", f"RMSE: {rmse1}", sep="\n")
add the exog variable
# Model 2: same orders as model 1, plus Walmart's IsHoliday flag as an
# exogenous regressor.
model2 = SARIMAX(train['Weekly_Sales'],exog=train['IsHoliday'],order=(2,0,2),seasonal_order=(1,0,0,52),enforce_invertibility=False)
results = model2.fit()
results.summary()
# Obtain predicted values; out-of-sample exog values must be supplied.
start=len(train)
end=len(train)+len(test)-1
exog_forecast = test[['IsHoliday']] # requires two brackets to yield a shape of (35,1)
predictions2 = results.predict(start=start, end=end, exog=exog_forecast)
models['WalmartHolidays'] = predictions2
models.head()
# Add model 2 (Walmart-holiday exog) to the comparison chart.
models_lines.add_trace(go.Scatter(x=models.index,
                                  y=models.WalmartHolidays,
                                  name="Model 2 - with Walmart Holidays"))
models_lines.show()
# Report the train/test date ranges.
trainmin, trainmax = train.index.min(), train.index.max()
testmin, testmax = test.index.min(), test.index.max()
print(f"Train Set: {trainmin} - {trainmax}\nTest Set: {testmin} - {testmax}")
# Model 2 error on the held-out test window.
mse2 = mse(test['Weekly_Sales'], predictions2)
rmse2 = rmse(test['Weekly_Sales'], predictions2)
print(f"MSE: {mse2}\nRMSE: {rmse2}")
add the exog variable
# Aggregate the all-holidays dataset by week.
byDatex = sales_all.groupby('Date').sum()
# Collapse summed holiday flags back to 0/1.
byDatex.IsHoliday = byDatex.IsHoliday.astype('bool').astype('int')
byDatex.head()
# FIX: a notebook-export glitch fused this comment with the next statement
# ("...train/test setsDatexs = byDatex.index"); re-split into comment + code.
#Split the data into train/test sets
Datexs = byDatex.index
length = len(byDatex)
train_len = int(length*0.70)
train = byDatex.iloc[:train_len]
test = byDatex.iloc[train_len:]
# Model 3: same specification as model 2, but trained on the all-holidays
# dataset (byDatex), so IsHoliday covers every consumer event.
model3 = SARIMAX(train['Weekly_Sales'],exog=train['IsHoliday'],order=(2,0,2),seasonal_order=(1,0,0,52),enforce_invertibility=False)
results = model3.fit()
results.summary()
# Obtain predicted values; out-of-sample exog values must be supplied.
start=len(train)
end=len(train)+len(test)-1
exog_forecast = test[['IsHoliday']] # requires two brackets to yield a shape of (35,1)
predictions3 = results.predict(start=start, end=end, exog=exog_forecast)
# NOTE(review): `models` is indexed by the Walmart-data test window while
# predictions3 follows the all-holidays test window -- alignment is assumed
# to match. TODO confirm.
models['AllHolidays'] = predictions3
models.head()
# Add model 3 to the comparison chart and report all three models' errors.
models_lines.add_trace(
    go.Scatter(
        x=models.index,
        y=models.AllHolidays,
        name = "Model 3 - All Holidays"
))
models_lines.show()
# (removed a duplicate `from statsmodels.tools.eval_measures import mse,rmse`;
# those names are already imported above)
mse3 = mse(test['Weekly_Sales'], predictions3)
rmse3 = rmse(test['Weekly_Sales'], predictions3)
print(f"MODEL 1\n MSE: {mse1}\n RMSE: {rmse1}\n")
print(f"MODEL 2\n MSE: {mse2}\n RMSE: {rmse2}\n")
print(f"MODEL 3\n MSE: {mse3}\n RMSE: {rmse3}")
The original data only includes four holidays; incorporate all U.S. consumer-spending events (Super Bowl, Black Friday) and other commonly observed holidays that would drive consumer spending, assuming all stores are U.S.-based. Back-to-school has no exact date, however.
Some holidays have trailing sales (shoppers prepare in advance); others see the effect on the day itself.
Retrain on all of the observed data before forecasting.
# Restrict byDate to the modeling columns before the final retrain.
byDate = byDate[['Weekly_Sales','IsHoliday']]
byDate.head()
# Future holiday calendar covering the forecast horizon.
# Dropped the redundant pd.DataFrame() wrapper -- read_csv returns a DataFrame.
fcast_dates = pd.read_csv("data/fcast.csv")
fcast_dates.IsHoliday = fcast_dates.IsHoliday.astype('bool').astype('int')
fcast_dates.Date = pd.to_datetime(fcast_dates.Date)
fcast_dates.head()
# Index by date so the forecast index lines up with the observed series.
fcast_dates.set_index('Date', inplace=True)
fcast_dates.index.freq = 'W-FRI'
fcast_dates.index
byDate.index
len(fcast_dates)
len(test)
# Retrain on the full observed series and forecast the future weeks.
# BUG FIX: the model was fit WITHOUT an exogenous regressor, yet predict()
# was called with exog=exog_forecast. An IsHoliday exog forecast is prepared
# below (and model 2 used the same flag), so fit with the holiday flag as
# exog to make the forecast call consistent.
model2x = SARIMAX(byDate['Weekly_Sales'],
                  exog=byDate['IsHoliday'],
                  order=(2,0,2),
                  seasonal_order=(1,0,0,52),
                  enforce_invertibility=False)
results = model2x.fit()
# Future exog values for the forecast horizon (shape (n, 1)).
exog_forecast = fcast_dates[['IsHoliday']]
fcast_predics = results.predict(fcast_dates.index.min(), fcast_dates.index.max(), exog=exog_forecast)
fcast_dates['Weekly_Sales'] = fcast_predics
fcast_dates['SalesType'] = "Forecast"
fcast_dates.head()
byDate['SalesType'] = "Actual"
byDate.head()
# Stitch actuals and forecast together for a single continuous chart.
byDateFcast = pd.concat([byDate,fcast_dates])
byDateFcast
fcast_line = px.line(byDateFcast, x=byDateFcast.index, y='Weekly_Sales', color='SalesType')
fcast_line.show()